Setup
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.4.3
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
## Warning: package 'circlize' was built under R version 4.0.3
## ========================================
## circlize version 0.4.11
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
##
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
## in R. Bioinformatics 2014.
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(circlize))
## ========================================
Overall Matrix agreement subsetted for cell lines ang genes in common and order.
RPPA? Plot in double matrix of transcripromis vs proteomics correlations ordered by pathway?
Part of of AA - > superimpose subpathways IDEAS - lineage per heatmap - RNAseq quartile vs protein levels mapping -
# load(file = "./../Project_Datasets/CCLE_OMICS.rda")
# load(file = "./../Project_Datasets/NCI60_OMICS.rda")
# load(file = "./../Project_Datasets/Metabolic_Pathway_annotations.RData")
#
# HUMAN_9606_idmapping <- read_tsv("./../Project_Datasets/HUMAN_9606_idmapping.dat",
# col_names = FALSE) %>%
# setNames(c("Uniprot", "Type", "ID"))
load(file = "./Project_Datasets/CCLE_OMICS.rda")
load(file = "./Project_Datasets/NCI60_OMICS.rda")
load(file = "./Project_Datasets/Metabolic_Pathway_annotations.RData")
#Derms <- openxlsx::read.xlsx("./../Project_Datasets/Lineage_tissue_types.xlsx", cols = c(1,6)) %>% na.omit()
Derms <- openxlsx::read.xlsx("./Project_Datasets/Lineage_tissue_types.xlsx", cols = c(1,6)) %>% na.omit()
## the file humankegg_df if currently used for subpathways instead of this
#Ordering SMPDB by pathway count and removing duplicates.
#This allows proteins to be in the most abundance pathway that they are in
# SMPDB <- SMPDB_pathways %>% group_by(Pathway) %>%
# summarise(count=n()) %>%
# right_join(SMPDB_pathways, by = 'Pathway') %>%
# arrange(-count) %>%
# subset(!duplicated(Gene_Name)) %>% .[,-c(2:4,6:9,11)] %>%
# pivot_longer(-Pathway, names_to = "Type", values_to = "ID") %>%
# setNames(c("Subpathway", "Type","ID"))
#Mapping of Uniprots to all others sort of identifiers
HUMAN_9606_idmapping <- read_tsv("./Project_Datasets/HUMAN_9606_idmapping.dat",
col_names = FALSE) %>%
setNames(c("Uniprot", "Type", "ID"))
#subset hsa KEGG to Gene name
HUMAN_9606_idmapping_hsa <- HUMAN_9606_idmapping[str_detect(HUMAN_9606_idmapping$ID, "hsa:"),-2] %>%
left_join(HUMAN_9606_idmapping[HUMAN_9606_idmapping$Type == "Gene_Name", -2], by = "Uniprot") %>%
na.omit() %>% .[,-1] %>% setNames(c("KEGG_ID", "Gene.names"))
#pairwise correlation function used in main function
Correlation_pairwise <- function(x,y){
cor(x,y,use = "pairwise", method = "pearson")
}
# Get lower triangle of the correlation matrix
get_lower_tri<-function(cormat){
cormat[lower.tri(cormat)] <- NA
return(cormat)
}
# Get upper triangle of the correlation matrix
get_upper_tri<-function(cormat){
cormat[upper.tri(cormat)] <- NA
return(cormat)
}
#compare 2 datasets with common Feature (rows) and Sample names (Columns)
### Change rerun value
Dataset_correlation <- function(Dataset_1,Dataset_2){
# type = c("Transcriptomics",
# "Proteomic",
# "Metabolomics")
# type <- "Transcriptomics"
# Type <- case_when(
# type == "Transcriptomics" ~ "Gene_name",
# type == "Proteomic" ~ "Uniprot",
# type == "Metabolomics" ~ "Metabolites",
# TRUE ~ as.character(type))
#Dataset_1 = RNA_seq
#Dataset_2 = NCI_60_RNA
#Dataset_1 = CCLE_proteins
#Dataset_2 = NCI_60_proteins
#Dataset_1 = CCLE_Met_Common
#Dataset_2 = NCI_60_metabolites_common
Comparison_type <- paste(deparse(substitute(Dataset_1)),deparse(substitute(Dataset_2)))
Dataset1 <- as.data.frame(Dataset_1)
Dataset2 <- as.data.frame(Dataset_2)
Common_Cell_lines <- intersect(colnames(Dataset1),colnames(Dataset2)) #Common Samples
Common_proteins <- intersect(rownames(Dataset1),rownames(Dataset2)) #Common Features
Master_pathways <- Master_pathways[!duplicated(Master_pathways$ID),]
##subsetting for specific master pathways
# Master_pathways <- Master_pathways %>%
# subset(Pathway %in% c("Amino acid","Nucleotide","TCA cycle"))
Master_pathways_ordered <- Master_pathways[Master_pathways$ID %in% Common_proteins,] %>%
left_join(humankegg_df, by = c("ID" = "Uniprot"))%>% .[order(.$Pathway, .$SubPathway),] %>%
subset(!duplicated(ID))
#left_join(Gene_pathways[,c("Uniprot","SubPathway")], by = c("ID" = "Uniprot")) %>% subset(!duplicated(ID))
Master_pathways_ordered[is.na(Master_pathways_ordered)] <- "Not Known"
#test that Dataset1[1,1] =! Dataset2 [1,1]
Subset_and_matrix <- function(x){
#x <- Dataset1
#subsetting and ordering the Datasets
x_sub <- x %>% .[rownames(.) %in% Common_proteins, colnames(.) %in% Common_Cell_lines]%>%
.[match(Common_proteins,rownames(.)),match(Common_Cell_lines,colnames(.))]
#print(deparse(substitute(x)))
cormat <- round(cor(t(x_sub[rownames(x_sub) %in% Master_pathways_ordered$ID,])),2) %>%
.[match(Master_pathways_ordered$ID,row.names(.)),match(Master_pathways_ordered$ID,row.names(.))] %>%
{if(x[1,1] == Dataset_1[1,1]) get_lower_tri(.) else get_upper_tri(.)}
cormat[is.na(cormat)] <- 0
list(matrix = cormat,
df = x_sub)
#cormat
}
list1 <- Subset_and_matrix(Dataset_1)
list2 <- Subset_and_matrix(Dataset_2)
Dataset1_sub <- list1$df %>% as.data.frame()
Dataset2_sub <- list2$df %>% as.data.frame()
#Combined_df <- rbind(cormat1,cormat2)
# #Combined_df$value[Combined_df$Var1 == Combined_df$Var2] <- NA
#
# Feature_order <- Combined_df[!duplicated(Combined_df$Var1),] %>%
# .[order(.$Pathway),"Var1"]
#
# ### ComplexHeatMAP
# test1 <- Master_pathways_ordered[order(Master_pathways_orderezd$Uniprot),]
Metabolic_matrix <- as.data.frame(list1$matrix+list2$matrix) %>% as.matrix()
set.seed(1587) # to set random generator seed
cl <- colors(distinct = TRUE)
Master_Pathway_colours <- sample(cl, length(unique(Master_pathways_ordered$Pathway))) %>% setNames(unique(Master_pathways_ordered$Pathway))
Sub_Pathway_colours <- sample(cl, length(unique(Master_pathways_ordered$SubPathway))) %>% setNames(unique(Master_pathways_ordered$SubPathway))
Metabolic_matrix[Metabolic_matrix ==2] <- 1
ha = HeatmapAnnotation(Master_path = Master_pathways_ordered$Pathway,
Sub_path = Master_pathways_ordered$SubPathway,
col = list(Master_path = Master_Pathway_colours,
Sub_path = Sub_Pathway_colours))
side = rowAnnotation(Master_path = Master_pathways_ordered$Pathway,
Sub_path = Master_pathways_ordered$SubPathway,
col = list(Master_path = Master_Pathway_colours,
Sub_path = Sub_Pathway_colours))
hmap <- Heatmap(Metabolic_matrix, name = Comparison_type,
top_annotation = ha ,
left_annotation = side,
cluster_rows = F, cluster_columns = F, show_column_names = F, show_row_names = F,
row_title = paste0("top = ", deparse(substitute(Dataset_1))),
column_title = paste0("bottom = ", deparse(substitute(Dataset_2))),
column_title_side = "bottom")
# ### GGPLOT
# Combined_df %>%
# subset(Var1 %in% Master_pathways$Uniprot & Var2 %in% Master_pathways$Uniprot) %>% na.omit() %>%
# ggplot(aes(factor(Var2, levels = Feature_order), factor(Var1,levels = Feature_order), fill = value))+
# geom_tile(color = "white")+
# scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "black",
# midpoint = 0, limit = c(-1,1), space = "Lab",
# name="Pearson\nCorrelation") +
# theme_minimal()+
# theme(axis.text.x = element_blank(), axis.text.y = element_blank())+
# coord_fixed() + labs(x = deparse(substitute(Dataset_1)),deparse(substitute(Dataset_2)))#+
# facet_grid(Pathway~., scales = "free_x")
#Correlating corresponding Samples Correlation of only paired values which allows for NA values
Cell_line_correlation <- map2_dbl(
Dataset1_sub,
Dataset2_sub,
Correlation_pairwise)
Cell_line_perm <- purrr::rerun(10,
map2_dbl(
Dataset1_sub[,sample(ncol(Dataset1_sub))],#Shuffle column-wise and performing sample correlation as above
Dataset2_sub,
Correlation_pairwise)) %>% unlist()
gene_correlation <- pmap(
list(pmap(Dataset1_sub, c, use.names = T), # unlisting feature rows into vectors into lists and correlating
pmap(Dataset2_sub, c, use.names = T)),
Correlation_pairwise) %>% unlist() %>% setNames(Common_proteins)
gene_correlation_perm <- purrr::rerun(10, #permutation by row shuflfing and correlation
pmap(list(pmap(Dataset1_sub[sample(nrow(Dataset1_sub)),],
c, use.names = T), #Shuffle row-wise
pmap(Dataset2_sub, c, use.names = T)),
Correlation_pairwise) %>% unlist()) %>% unlist()
# Imputing both datasets with Media to derive overall correlation
Dataset1_sub_imp <- naniar::impute_median_all(Dataset1_sub)
Dataset2_sub_imp <- naniar::impute_median_all(Dataset2_sub)
Matri_Corr <- MatrixCorrelation::r1(Dataset1_sub_imp,Dataset2_sub_imp) # whole matrix correlation
Matrix_perm <- purrr::rerun(10, #whole matrix permutation of Dataset 1 col-wise shuffle and Dataset 2 row-wise shuffle and correlation
MatrixCorrelation::r1(Dataset1_sub_imp[,sample(ncol(Dataset1_sub_imp))],
Dataset2_sub_imp[sample(nrow(Dataset2_sub_imp)),])) %>%
unlist() # permutation
Hist_plot <- function(Data,Perm){
#Data = Cell_line_correlation
#Perm = Cell_line_perm
Sample_Corr <- data.frame(Samples = c(Data,unlist(Perm)),
Type = rep(c("Data","Perm"), times = c(length(Data),length(unlist(Perm)))))
p <- ggplot2::ggplot(Sample_Corr,aes(Samples)) +
geom_histogram(aes(y=..density..,fill = Type ), # Histogram with density instead of count on y-axis
binwidth=.01,alpha=0.1) +
geom_density(alpha=.5, aes(colour = Type))+
ggtitle(paste(Comparison_type,deparse(substitute(Data))))
}
Feature_quartiles <- rowMeans(Dataset2_sub, na.rm = T) %>% data.frame(Feature = names(.),
Mean_Abundance = .) %>%
mutate(quartile = as.factor(ntile(Mean_Abundance, 4))) %>% left_join(stack(gene_correlation), by = c("Feature" = "ind"))
Feature_expression_cor_plot <- ggplot(Feature_quartiles, aes(values,after_stat(density), colour =quartile)) +
geom_freqpoly(binwidth = 0.1) +
ggtitle(paste("Feature Expression",Comparison_type))
Sample_lineage <- Cell_line_correlation %>% data.frame(Sample = names(.), Correlation = .) %>%
left_join(sample_info[,c("stripped_cell_line_name", "lineage")], by = c("Sample" = "stripped_cell_line_name")) %>%
left_join(Derms, by = c("lineage" = "Cancer_Lineage" ))
Sample_lineage_cor_plot <- ggplot(Sample_lineage, aes(Correlation,after_stat(density), colour = Derm)) +
geom_freqpoly(binwidth = 0.01) +
ggtitle(paste("Sample Lineage",Comparison_type))
list(Sample_Corr = list(Sample_lineage = Sample_lineage,
Cell_line_perm = Cell_line_perm,
Histogram = Hist_plot(Cell_line_correlation,Cell_line_perm),
Sample_lin_cor_plot = Sample_lineage_cor_plot),
Feature_Corr = list(Features_quart = Feature_quartiles,
Gene_corr_perm = gene_correlation_perm,
Histogram = Hist_plot(gene_correlation, gene_correlation_perm),
Feature_cor_plot = Feature_expression_cor_plot),
Matrix_Corr= list(Matri_Corr = Matri_Corr,
Matrix_perm = Matrix_perm,
Histogram = Hist_plot(Matri_Corr,Matrix_perm)),
Heatmap= list(Dataset1_matri = list1$matrix,
Dataset2_matri = list2$matrix,
Plot_Heatmap = hmap))
}
### Metabolic Comparison ###
#Metabolite_mapping <- read.csv("./../Project_Datasets/CCLE_metabolites_mapped.csv")
Metabolite_mapping <- read.csv("./Project_Datasets/CCLE_metabolites_mapped.csv")
NCI_60_metabolites <- NCI_60_metabolites %>% separate_rows(Annotation.ID, sep = " ; ") %>%
subset(str_detect(Annotation.ID, "^H|C") & !duplicated(Annotation.ID)) %>%
mutate(Annotation.ID = str_replace_all(Annotation.ID,"HMDB", "HMDB00"))
Common_Metabolites <- Metabolite_mapping[(Metabolite_mapping$KEGG %in% (NCI_60_metabolites$Annotation.ID)) |
(Metabolite_mapping$HMDB %in% (NCI_60_metabolites$Annotation.ID)) ,]
NCI_60_metabolites_common <- NCI_60_metabolites[str_detect(NCI_60_metabolites$Annotation.ID,
paste(c(Common_Metabolites$HMDB,Common_Metabolites$KEGG), collapse = "|")),]
tt1=NCI_60_metabolites_common$Annotation.ID[str_detect(NCI_60_metabolites_common$Annotation.ID,"^C")]
NCI_60_metabolites_common$Annotation.ID[str_detect(NCI_60_metabolites_common$Annotation.ID,"^C")] <- Metabolite_mapping[Metabolite_mapping$KEGG %in% tt1,] %>% .[match(tt1, .$KEGG),] %>% pull(HMDB)
NCI_60_metabolites_common <- NCI_60_metabolites_common%>%
subset(!duplicated(Annotation.ID)) %>%
column_to_rownames("Annotation.ID")
CCLE_Met_Common <- Metabolites[rownames(Metabolites) %in% Common_Metabolites$Query,]
rownames(CCLE_Met_Common) <- Common_Metabolites$HMDB[match(rownames(CCLE_Met_Common),Common_Metabolites$Query)]
omics <- list(Transcriptome = Dataset_correlation(RNA_seq,NCI_60_RNA),
Proteome = Dataset_correlation(CCLE_proteins,NCI_60_proteins),
Metabolome = Dataset_correlation(CCLE_Met_Common,NCI_60_metabolites_common))Proteomics heatmap: Purine and Pyrimidine in nucleotide = opposite trends? AA metabolism: Proteosome and Ribosomal genes Other pathways might have too many subpathways for pattern to emerge TCA: Not Known pathway genes
Feature correlation
$Transcriptome
$Proteome
$Metabolome
# Essentiality correlation
### Transcriptomic Comparison ###
Essential_vs_Not_Genes <- omics$Transcriptome$Feature_Corr$Features_quart %>% mutate(Essentiality = if_else(Feature %in%
Essential_genes, "Essential", "Not_Essential"))
ggplot(Essential_vs_Not_Genes, aes(values, after_stat(density), colour = Essentiality)) +
geom_freqpoly(binwidth = 0.1) + ggtitle(paste("mRNA essentiality Correlation"))Feature Gene expression Corr
$Transcriptome
$Proteome
$Metabolome
1 is the bottom quartile and 4 the top quartile
Heatmaps
$Transcriptome
$Proteome
$Metabolome
Here we see that the proteins might actually be more biologically relevant than the mRNA and the 2 datasets are comparable. The CCLE proteome has more NAs but this is not shown here due to ‘pairwise’ correlation. But when the data is present, the comparison is good
Transcript-Proteom Heatmap
Genes_Matrix <- omics$Transcriptome$Heatmap$Dataset1_matri
Proteins_Matrix <- omics$Proteome$Heatmap$Dataset1_matri
Gene_mapping <- HUMAN_9606_idmapping %>%
subset((Type == "Gene_Name") & (Uniprot %in% rownames(Proteins_Matrix)),-2) %>%
subset(!(duplicated(Uniprot) | duplicated(ID))) %>%
.[match(rownames(Proteins_Matrix),.$Uniprot),] %>% .$ID
Proteins_Matrix <- Proteins_Matrix %>%
magrittr::set_rownames(Gene_mapping) %>%
magrittr::set_colnames(Gene_mapping)
common_genes <- intersect(rownames(Genes_Matrix), rownames(Proteins_Matrix))
Master_pathways <- Master_pathways[!duplicated(Master_pathways$ID),]
Master_pathways_ordered <- Master_pathways[Master_pathways$ID %in% common_genes,] %>% .[order(.$Pathway),]# %>%
#left_join(humankegg_df, by = c("ID" = "Uniprot")) %>% subset(!duplicated(ID))
#left_join(Gene_pathways[,c("Uniprot","SubPathway")], by = c("ID" = "Uniprot")) %>% subset(!duplicated(ID))
Master_pathways_ordered[is.na(Master_pathways_ordered)] <- "Not Known"
common_genes <- common_genes[match(Master_pathways_ordered$ID, common_genes)]
Genes_matrix <- Genes_Matrix %>%
.[rownames(.) %in% common_genes,colnames(.) %in% common_genes] %>%
.[match(common_genes,rownames(.)),match(common_genes,colnames(.))]
Proteins_Matrix <- Proteins_Matrix %>%
.[rownames(.) %in% common_genes,colnames(.) %in% common_genes] %>%
.[match(common_genes,rownames(.)),match(common_genes,colnames(.))]
Combined_matrix <- as.data.frame(Genes_matrix+t(Proteins_Matrix)) %>% as.matrix()
set.seed(1587) # to set random generator seed
cl <- colors(distinct = TRUE)
Master_Pathway_colours <- sample(cl, length(unique(Master_pathways_ordered$Pathway))) %>% setNames(unique(Master_pathways_ordered$Pathway))
#Sub_Pathway_colours <- sample(cl, length(unique(Master_pathways_ordered$SubPathway))) %>% setNames(unique(Master_pathways_ordered$SubPathway))
Combined_matrix[Combined_matrix ==2] <- 1
ha = HeatmapAnnotation(Master_path = Master_pathways_ordered$Pathway,
#Sub_path = Master_pathways_ordered$SubPathway,
col = list(Master_path = Master_Pathway_colours))#,
#Sub_path = Sub_Pathway_colours))
side = rowAnnotation(Master_path = Master_pathways_ordered$Pathway,
#Sub_path = Master_pathways_ordered$SubPathway,
col = list(Master_path = Master_Pathway_colours))#,
#Sub_path = Sub_Pathway_colours))
hmap <- Heatmap(Combined_matrix, #name = Comparison_type,
top_annotation = ha ,
left_annotation = side,
cluster_rows = F, cluster_columns = F, show_column_names = F, show_row_names = F,
row_title = paste0("top = ", "Transcriptome"),
column_title = paste0("bottom = ", "Proteome"),
column_title_side = "bottom")
hmap